Load Libraries

library(tidyverse)
library(GGally)
library(cowplot)
library(class)
library(caret)
library(e1071)
library(reshape2)
library(stringr)

Load and look at the data

# Read the primary case-study data (870 employees x 36 variables) and
# inspect its structure. TRUE spelled out rather than the reassignable `T`.
attrition.df <- read.csv("CaseStudy2-data.csv", header = TRUE)
str(attrition.df)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : chr  "No" "No" "No" "No" ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : chr  "Sales" "Research & Development" "Research & Development" "Sales" ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : chr  "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : chr  "Divorced" "Single" "Single" "Married" ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "No" "No" ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
# Read the competition set with MonthlyIncome withheld (300 rows x 35 variables)
# and inspect its structure. TRUE spelled out rather than the reassignable `T`.
noSalary <- read.csv("CaseStudy2CompSet(NoSalary).csv", header = TRUE)
str(noSalary)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  871 872 873 874 875 876 877 878 879 880 ...
##  $ Age                     : int  43 33 55 36 27 39 33 21 30 51 ...
##  $ Attrition               : chr  "No" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr  "Travel_Frequently" "Travel_Rarely" "Travel_Rarely" "Non-Travel" ...
##  $ DailyRate               : int  1422 461 267 1351 1302 895 750 251 1312 1405 ...
##  $ Department              : chr  "Sales" "Research & Development" "Sales" "Research & Development" ...
##  $ DistanceFromHome        : int  2 13 13 9 19 5 22 10 23 11 ...
##  $ Education               : int  4 1 4 4 3 3 2 2 3 2 ...
##  $ EducationField          : chr  "Life Sciences" "Life Sciences" "Marketing" "Life Sciences" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1849 995 1372 1949 1619 42 160 1279 159 1367 ...
##  $ EnvironmentSatisfaction : int  1 2 1 1 4 4 3 1 1 4 ...
##  $ Gender                  : chr  "Male" "Female" "Male" "Male" ...
##  $ HourlyRate              : int  92 53 85 66 67 56 95 45 96 82 ...
##  $ JobInvolvement          : int  3 3 4 4 2 3 3 2 1 2 ...
##  $ JobLevel                : int  2 1 4 1 1 2 2 1 1 4 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Scientist" "Sales Executive" "Laboratory Technician" ...
##  $ JobSatisfaction         : int  4 4 3 2 1 4 2 3 3 2 ...
##  $ MaritalStatus           : chr  "Married" "Single" "Single" "Married" ...
##  $ MonthlyRate             : int  19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
##  $ NumCompaniesWorked      : int  1 3 6 1 1 3 0 1 1 3 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "Yes" "No" ...
##  $ PercentSalaryHike       : int  20 18 17 22 11 14 13 20 25 16 ...
##  $ PerformanceRating       : int  4 3 3 4 3 3 3 4 4 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 2 1 3 1 3 3 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 0 2 1 1 0 3 0 ...
##  $ TotalWorkingYears       : int  7 5 24 5 7 19 8 2 10 29 ...
##  $ TrainingTimesLastYear   : int  5 4 2 3 3 6 2 2 2 1 ...
##  $ WorkLifeBalance         : int  3 3 2 3 3 4 4 1 2 2 ...
##  $ YearsAtCompany          : int  7 3 19 5 7 1 7 2 10 5 ...
##  $ YearsInCurrentRole      : int  7 2 7 4 7 0 7 2 7 2 ...
##  $ YearsSinceLastPromotion : int  7 0 3 0 0 0 0 2 0 0 ...
##  $ YearsWithCurrManager    : int  7 2 8 2 7 0 7 2 9 3 ...

Attrition EDA

Attrition Dependent on Marital Status

# Does marital status contribute to attrition?
# Bar chart: marital-status counts, one facet per attrition outcome.
bar <- ggplot(attrition.df, aes(x = MaritalStatus, fill = MaritalStatus)) +
  geom_bar() +
  facet_wrap(~Attrition) +
  ggtitle("Attrition Dependent on Marital Status ") +
  xlab("Marital Status") +  # fixed: x axis is MaritalStatus, was labeled "Attrition"
  ylab("Count")

# Build a pie chart of marital-status counts for one attrition outcome.
# Replaces the duplicated grep()-based counting for the "Yes" and "No" subsets.
marital_pie <- function(attr_value) {
  counts <- attrition.df %>%
    filter(Attrition == attr_value) %>%
    count(MaritalStatus, name = "Value")
  ggplot(counts, aes(x = "", y = Value, fill = MaritalStatus)) +
    geom_bar(width = 1, stat = "identity") +
    xlab("") + ylab("") +
    ggtitle(paste0("Attrition: ", attr_value)) +
    coord_polar("y", start = 0)
}

pie1 <- marital_pie("Yes")  # employees who left
pie2 <- marital_pie("No")   # employees who stayed

# Combine: bar chart on top (panel A), the two pies below (B = No, C = Yes)
piecharts <- plot_grid(pie2, pie1, ncol = 2, labels = c("B", "C"))
allplots <- plot_grid(bar, piecharts, nrow = 2, labels = "A")
allplots

Monthly Income

# Distribution of monthly income, fill-colored by attrition outcome
attrition.df %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  labs(title = "Monthly Income Based on Attrition",
       x = "Monthly Income",
       y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Monthly Income Based on Job Role

# Distribution of monthly income, faceted by job role, colored by attrition
attrition.df %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  facet_wrap(~JobRole) +
  labs(title = "Monthly Income Based on Job Role",
       x = "Monthly Income",
       y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Percentage of Attrition Based on Multiple Variables

# Proportion-of-attrition bar chart for a single predictor.
# `var` is a column name (string); `title` is the plot title.
# Replaces nine near-identical copy-pasted ggplot chunks with one helper.
attrition_rate_bar <- function(var, title) {
  ggplot(attrition.df, aes(x = .data[[var]], fill = Attrition)) +
    geom_bar(position = "fill") +
    ggtitle(title) +
    xlab(var) +  # keep the bare variable name as the axis label, as before
    ylab("% Count") +
    scale_y_continuous(labels = scales::percent) +
    theme(legend.position = "none")
}

EDA1 <- attrition_rate_bar("RelationshipSatisfaction", "Relationship Satisfaction")
EDA2 <- attrition_rate_bar("JobLevel", "Job Level")
EDA3 <- attrition_rate_bar("JobSatisfaction", "Job Satisfaction")
EDA4 <- attrition_rate_bar("YearsWithCurrManager", "Years With Current Manager")
EDA5 <- attrition_rate_bar("YearsSinceLastPromotion", "Years Since Last Promotion")
EDA6 <- attrition_rate_bar("YearsAtCompany", "Years At Company")
EDA7 <- attrition_rate_bar("YearsInCurrentRole", "Years in Current Role")
EDA8 <- attrition_rate_bar("NumCompaniesWorked", "Companies Worked")
EDA9 <- attrition_rate_bar("MaritalStatus", "Marital Status")

# 3x3 grid of the nine attrition-rate panels
EDA10 <- plot_grid(EDA1, EDA2, EDA3, EDA4, EDA5, EDA6, EDA7, EDA8, EDA9,
                   ncol = 3, nrow = 3)
EDA10

Scatter Plots with Continuous Variables

# Scatter plot of two continuous variables, colored by attrition outcome.
# `xvar`/`yvar` are column names (strings); replaces six copy-pasted chunks.
attrition_scatter <- function(xvar, yvar, title) {
  ggplot(attrition.df, aes(x = .data[[xvar]], y = .data[[yvar]], color = Attrition)) +
    geom_point() +
    ggtitle(title) +
    xlab(xvar) +  # keep the bare variable names as axis labels, as before
    ylab(yvar)
}

EDA.A <- attrition_scatter("TotalWorkingYears", "PercentSalaryHike",
                           "Total Working Years vs Percent Salary Hike")
EDA.B <- attrition_scatter("YearsWithCurrManager", "PercentSalaryHike",
                           "Years With Current Manager vs Percent Salary Hike")
EDA.C <- attrition_scatter("YearsAtCompany", "YearsInCurrentRole",
                           "Years At Company vs Years In CurrentRole")
EDA.D <- attrition_scatter("Age", "MonthlyIncome",
                           "Age vs Monthly Income")
EDA.E <- attrition_scatter("Age", "YearsSinceLastPromotion",
                           "Age vs Job YearsSinceLastPromotion")
EDA.F <- attrition_scatter("Age", "PercentSalaryHike",
                           "Age vs Percent Salary Hike")

# 2x3 grid of the six scatter panels
EDA.G <- plot_grid(EDA.A, EDA.B, EDA.C, EDA.D, EDA.E, EDA.F, ncol = 2, nrow = 3)
EDA.G

GGpairs Plots

ggpairs(attrition.df, columns = c(2,25,35,36,3), aes(color = Attrition))

Attrition Analysis by Job Role

# Age vs monthly income, one panel per job role, colored by attrition outcome
attrition.df %>%
  ggplot(aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  geom_point() +
  facet_wrap(~JobRole)

Models for Attrition

k-NN Model (Train and Test Sets)

Conclusion: This model does not have enough “Yes” attrition data to be usable. An internal (leave-one-out cross-validated) model may prove better able to handle this skewed data.

# k-NN train/test experiment: 85/15 split, seed fixed for reproducibility
set.seed(9)
splitPerc <- .85
trainIndices <- sample(1:dim(attrition.df)[1],round(splitPerc * dim(attrition.df)[1]))
dfTrain <- attrition.df[trainIndices,]
dfTest <- attrition.df[-trainIndices,]
dfTrain <- na.omit(dfTrain)
dfTest <- na.omit(dfTest)

# Candidate numeric predictor columns (index: name):
# 2:Age, 18:JobSatisfaction, 20:MonthlyIncome, 25:PercentSalaryHike,
# 30:TotalWorkingYears, 33:YearsAtCompany,
# 34:YearsInCurrentRole, 35:YearsSinceLastPromotion, 36:YearsWithCurrManager

# k-NN on Age (col 2) and YearsSinceLastPromotion (col 35) with k = 10.
# NOTE(review): the two features are on different scales and are not
# standardized, which biases distance-based k-NN — consider scale() first.
classifications <- knn(dfTrain[,c(2,35)], dfTest[,c(2,35)], dfTrain$Attrition, 
                       prob = TRUE, k = 10)
table(dfTest$Attrition,classifications)
##      classifications
##        No Yes
##   No  106   0
##   Yes  22   2
confusionMatrix(table(dfTest$Attrition,classifications))
## Confusion Matrix and Statistics
## 
##      classifications
##        No Yes
##   No  106   0
##   Yes  22   2
##                                           
##                Accuracy : 0.8308          
##                  95% CI : (0.7551, 0.8908)
##     No Information Rate : 0.9846          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1291          
##                                           
##  Mcnemar's Test P-Value : 7.562e-06       
##                                           
##             Sensitivity : 0.82812         
##             Specificity : 1.00000         
##          Pos Pred Value : 1.00000         
##          Neg Pred Value : 0.08333         
##              Prevalence : 0.98462         
##          Detection Rate : 0.81538         
##    Detection Prevalence : 0.81538         
##       Balanced Accuracy : 0.91406         
##                                           
##        'Positive' Class : No              
## 

Loop for many k

# Average test accuracy over many random train/test partitions for k = 1..numks.
# Dropped: an `accs` data.frame that was created but never used, and a bare
# table() call whose result was discarded (confusionMatrix recomputes it).
iterations <- 100
numks <- 50

# Row = iteration, column = k; preallocated accuracy matrix
masterAcc <- matrix(nrow = iterations, ncol = numks)

for (j in seq_len(iterations)) {
  # Fresh random split each iteration (reuses splitPerc defined above)
  trainIndices <- sample(seq_len(nrow(attrition.df)),
                         round(splitPerc * nrow(attrition.df)))
  train <- na.omit(attrition.df[trainIndices, ])
  test <- na.omit(attrition.df[-trainIndices, ])
  for (i in seq_len(numks)) {
    classifications <- knn(train[, c(2, 35)], test[, c(2, 35)], train$Attrition,
                           prob = TRUE, k = i)
    CM <- confusionMatrix(table(classifications, test$Attrition))
    masterAcc[j, i] <- CM$overall[1]  # overall accuracy for this (split, k)
  }
}

# Mean accuracy per k across all random partitions
MeanAcc <- colMeans(masterAcc)

plot(seq_len(numks), MeanAcc, type = "l", xlab = "k", ylab = "Mean Accuracy",
     main = "Mean Accuracy of k Values")

Internal k-NN Model

# Internal model: leave-one-out cross-validated k-NN on the training set,
# using the same two (unscaled) predictors — Age (col 2) and
# YearsSinceLastPromotion (col 35) — with k = 20
classifications1 <- knn.cv(dfTrain[,c(2,35)],dfTrain$Attrition, k = 20)
confusionMatrix(table(classifications1,dfTrain$Attrition))
## Confusion Matrix and Statistics
## 
##                 
## classifications1  No Yes
##              No  618 107
##              Yes   6   9
##                                           
##                Accuracy : 0.8473          
##                  95% CI : (0.8193, 0.8725)
##     No Information Rate : 0.8432          
##     P-Value [Acc > NIR] : 0.4044          
##                                           
##                   Kappa : 0.1053          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99038         
##             Specificity : 0.07759         
##          Pos Pred Value : 0.85241         
##          Neg Pred Value : 0.60000         
##              Prevalence : 0.84324         
##          Detection Rate : 0.83514         
##    Detection Prevalence : 0.97973         
##       Balanced Accuracy : 0.53399         
##                                           
##        'Positive' Class : No              
## 

Naive Bayes

# Naive Bayes on OverTime (col 24) and MonthlyIncome (col 20),
# with Laplace smoothing = 1
NBmodel <- naiveBayes(dfTrain[,c(24,20)],dfTrain$Attrition,laplace = 1)
# Predicted vs observed attrition on the held-out test set
table(predict(NBmodel,dfTest[,c(24,20)]),dfTest$Attrition)
##      
##        No Yes
##   No  106  24
##   Yes   0   0
confusionMatrix(table(predict(NBmodel,dfTest[,c(24,20)]),dfTest$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  106  24
##   Yes   0   0
##                                          
##                Accuracy : 0.8154         
##                  95% CI : (0.7379, 0.878)
##     No Information Rate : 0.8154         
##     P-Value [Acc > NIR] : 0.5543         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : 2.668e-06      
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.8154         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.8154         
##          Detection Rate : 0.8154         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : No             
## 
# Naive Bayes using every other column as a predictor.
# NOTE(review): this includes identifier-like columns (ID, EmployeeNumber)
# and constants (EmployeeCount, StandardHours, Over18) — verify they belong.
NBmodel1 = naiveBayes(Attrition~.,data = dfTrain)
# Distribution of predicted classes on the test set
table(predict(NBmodel1,dfTest))
## 
##  No Yes 
## 102  28
confusionMatrix(table(predict(NBmodel1,dfTest),dfTest$Attrition))
## Confusion Matrix and Statistics
## 
##      
##       No Yes
##   No  96   6
##   Yes 10  18
##                                          
##                Accuracy : 0.8769         
##                  95% CI : (0.8078, 0.928)
##     No Information Rate : 0.8154         
##     P-Value [Acc > NIR] : 0.0401         
##                                          
##                   Kappa : 0.616          
##                                          
##  Mcnemar's Test P-Value : 0.4533         
##                                          
##             Sensitivity : 0.9057         
##             Specificity : 0.7500         
##          Pos Pred Value : 0.9412         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.8154         
##          Detection Rate : 0.7385         
##    Detection Prevalence : 0.7846         
##       Balanced Accuracy : 0.8278         
##                                          
##        'Positive' Class : No             
## 

New Dataframe for Multiple Variables in Models

# Make a reduced dataframe of hand-picked predictors — selected by name
# instead of the fragile positional indices — then split 70/30 into
# train and test sets (seed fixed for reproducibility).
set.seed(9)
keep_cols <- c("Age", "Attrition", "BusinessTravel", "DailyRate", "Department",
               "DistanceFromHome", "JobInvolvement", "JobLevel",
               "JobSatisfaction", "MonthlyIncome", "StockOptionLevel",
               "TotalWorkingYears", "WorkLifeBalance", "YearsAtCompany",
               "YearsInCurrentRole", "YearsSinceLastPromotion")
new.df <- attrition.df[, keep_cols]
splitPerc <- .7
trainIndices <- sample(1:dim(new.df)[1],round(splitPerc * dim(new.df)[1]))
dfTrain1 <- new.df[trainIndices,]
dfTest1 <- new.df[-trainIndices,]
dfTrain1 <- na.omit(dfTrain1)
dfTest1 <- na.omit(dfTest1)

Run Model with New Dataframe

# Naive Bayes model on the reduced dataframe (all retained columns as
# predictors), evaluated against the held-out test labels
new.dfmodel = naiveBayes(Attrition~.,data = dfTrain1)
confusionMatrix(table(predict(new.dfmodel,dfTest1),dfTest1$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  206  20
##   Yes  17  18
##                                           
##                Accuracy : 0.8582          
##                  95% CI : (0.8099, 0.8982)
##     No Information Rate : 0.8544          
##     P-Value [Acc > NIR] : 0.4733          
##                                           
##                   Kappa : 0.4109          
##                                           
##  Mcnemar's Test P-Value : 0.7423          
##                                           
##             Sensitivity : 0.9238          
##             Specificity : 0.4737          
##          Pos Pred Value : 0.9115          
##          Neg Pred Value : 0.5143          
##              Prevalence : 0.8544          
##          Detection Rate : 0.7893          
##    Detection Prevalence : 0.8659          
##       Balanced Accuracy : 0.6987          
##                                           
##        'Positive' Class : No              
## 

Linear Regression

Attrition Linear Regression

Note: linear regression may not be appropriate for a binary response such as attrition; logistic regression is the standard approach. The fit below is exploratory.

# Recode Attrition to numeric 0/1 for linear regression ("Yes" -> 1, "No" -> 0).
# A single logical comparison replaces the original two-step gsub() text
# substitution followed by as.numeric().
# NOTE(review): a linear probability model on a binary outcome is unusual;
# logistic regression (glm with family = binomial) is the conventional choice.
attrition.lm <- attrition.df
attrition.lm$Attrition <- as.numeric(attrition.lm$Attrition == "Yes")

# Fit candidate models: interaction, additive, and single-predictor
attritionfit = lm(Attrition~Age*MonthlyIncome, data = attrition.lm)
attritionfit1 = lm(Attrition~Age+MonthlyIncome, data = attrition.lm)
attritionfit2 = lm(Attrition~MonthlyIncome, data = attrition.lm)
attritionfit3 = lm(Attrition~Age, data = attrition.lm)

# Information about the interaction model: summary table, 95% confidence
# intervals, raw coefficients, and residual diagnostics
summary(attritionfit)
## 
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = attrition.lm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37829 -0.19156 -0.13330 -0.06808  1.02288 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.202e-01  8.883e-02   6.981 5.83e-12 ***
## Age               -1.030e-02  2.353e-03  -4.379 1.34e-05 ***
## MonthlyIncome     -5.761e-05  1.400e-05  -4.114 4.26e-05 ***
## Age:MonthlyIncome  1.130e-06  3.153e-07   3.583 0.000358 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3599 on 866 degrees of freedom
## Multiple R-squared:  0.04538,    Adjusted R-squared:  0.04207 
## F-statistic: 13.72 on 3 and 866 DF,  p-value: 9.485e-09
confint(attritionfit)
##                           2.5 %        97.5 %
## (Intercept)        4.457982e-01  7.945094e-01
## Age               -1.491937e-02 -5.684270e-03
## MonthlyIncome     -8.508807e-05 -3.012455e-05
## Age:MonthlyIncome  5.109013e-07  1.748437e-06
attritionfit$coefficients
##       (Intercept)               Age     MonthlyIncome Age:MonthlyIncome 
##      6.201538e-01     -1.030182e-02     -5.760631e-05      1.129669e-06
hist(attritionfit$residuals, col = "blue", main = "Histogram of Residuals")

# Root mean squared error of the in-sample residuals
sqrt(mean(attritionfit$residuals^2))
## [1] 0.3590231
# Sum of squared errors (SSE)
sum((attritionfit$residuals)^2)
## [1] 112.1409

Train and Test for Linear Regression Attrition

# 80/20 train/test split for evaluating the linear-regression attrition model
set.seed(9)
splitPerc <- .8
n <- nrow(attrition.lm)
trainIndices <- sample(seq_len(n), round(splitPerc * n))
dfTrain2 <- na.omit(attrition.lm[trainIndices, ])
dfTest2 <- na.omit(attrition.lm[-trainIndices, ])

# Best LM model for Attrition: Age x MonthlyIncome interaction, fit on train
LR.fit <- lm(Attrition ~ Age * MonthlyIncome, data = dfTrain2)
summary(LR.fit)
## 
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = dfTrain2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38770 -0.19212 -0.13098 -0.06281  0.97511 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.504e-01  9.709e-02   6.698 4.37e-11 ***
## Age               -1.093e-02  2.578e-03  -4.238 2.56e-05 ***
## MonthlyIncome     -6.789e-05  1.542e-05  -4.403 1.24e-05 ***
## Age:MonthlyIncome  1.351e-06  3.464e-07   3.900 0.000106 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3576 on 692 degrees of freedom
## Multiple R-squared:  0.05128,    Adjusted R-squared:  0.04716 
## F-statistic: 12.47 on 3 and 692 DF,  p-value: 6.013e-08
# Predict attrition (as a numeric score) on the held-out test set and display
LR.Preds <- predict(LR.fit, newdata = dfTest2)
as.data.frame(LR.Preds)
##          LR.Preds
## 4    0.0436343376
## 7    0.1757354301
## 8    0.1261548668
## 11  -0.0096785475
## 20   0.1270006629
## 23   0.1247949152
## 25   0.1277521340
## 33   0.0506196216
## 40   0.2390046050
## 47   0.1345714009
## 55   0.1965221637
## 56   0.1737091788
## 58   0.1978393508
## 62   0.1203358821
## 66   0.2193424035
## 81   0.1604895283
## 83   0.1470060521
## 84   0.1390505143
## 88   0.1907521927
## 92   0.1228033569
## 93   0.2086153683
## 99   0.2421487801
## 100  0.2064850283
## 101  0.2893754885
## 104  0.2952330668
## 105  0.1326403819
## 125  0.2026924251
## 129  0.1404653443
## 132  0.1072503453
## 133  0.1409489462
## 152  0.1675968220
## 157  0.1345922483
## 160  0.0442400680
## 161  0.1822631461
## 169  0.0952463419
## 176  0.3390744220
## 178  0.2993354296
## 185  0.1670835191
## 186  0.2821553040
## 188  0.0664447806
## 198  0.0925916183
## 206  0.1479196979
## 216  0.1807685996
## 220  0.2720391955
## 235  0.0141801220
## 236  0.0560347834
## 238  0.1447577332
## 240  0.0993334890
## 251  0.2429442292
## 254  0.2347121354
## 255  0.0226159823
## 256  0.1565770557
## 257  0.2390566349
## 260  0.2414836826
## 264  0.2715256733
## 271  0.1220118722
## 275  0.1692231504
## 281  0.1035349089
## 283  0.1230065851
## 285  0.1880249177
## 286  0.1683642833
## 288  0.2203874771
## 298  0.1454284601
## 335  0.2281072295
## 338  0.1084870401
## 346  0.0212883864
## 347  0.1902545233
## 366  0.1408643103
## 370  0.1760073288
## 380  0.2641960260
## 392  0.2300298652
## 394  0.1297804955
## 403  0.1275064367
## 405  0.2151568837
## 420 -0.0092634384
## 425  0.2023571148
## 427  0.2876757891
## 446  0.1217159934
## 453  0.1480852077
## 456 -0.0615879524
## 461  0.1680604569
## 463  0.1007597374
## 466  0.1698090307
## 470  0.0876810399
## 480 -0.0001150151
## 482  0.2503211157
## 492  0.0980652835
## 493  0.1649303386
## 496  0.1093654833
## 497  0.1397739409
## 498  0.2043884617
## 499  0.1561042012
## 505  0.0611739359
## 508  0.1036572356
## 510 -0.0421241154
## 511  0.1466417344
## 515  0.3177324288
## 518  0.1605517957
## 526  0.0990615121
## 530  0.0769086639
## 531  0.0890132918
## 533  0.2559326596
## 540  0.1338977221
## 548  0.1426717240
## 580  0.1432781612
## 591  0.3025435343
## 592  0.1778235817
## 594  0.1094560472
## 605  0.2542096746
## 607  0.0702584011
## 608  0.1423030413
## 613  0.2266480798
## 615  0.1237552980
## 621  0.3122184932
## 623  0.2067925074
## 626  0.2762428213
## 627  0.2404771972
## 629  0.1196300431
## 630  0.0894892230
## 631  0.0664003641
## 632  0.0616623626
## 642  0.1516242819
## 644  0.2281623063
## 647  0.3707016773
## 649  0.1790239249
## 673  0.1971254325
## 685  0.2628221331
## 693  0.1157564397
## 694  0.2600090752
## 696  0.1222708191
## 702  0.1533350660
## 708  0.2567468125
## 710  0.0976986242
## 713  0.1180365042
## 721  0.0770797638
## 722  0.1427401023
## 724  0.0863588681
## 732  0.1445283155
## 743  0.1693015541
## 745  0.2097527218
## 746  0.3066642860
## 752  0.2517259239
## 754  0.1056173402
## 757  0.1851663217
## 763  0.1254043467
## 768  0.0617661000
## 772  0.0875552102
## 774  0.1917184296
## 775  0.0784585878
## 777  0.2660568930
## 779  0.2820336150
## 781  0.1630651052
## 782  0.1549465990
## 787  0.1061760211
## 792  0.2280744551
## 796  0.2117041144
## 802  0.1403818648
## 803 -0.0353962014
## 807  0.1645446326
## 813  0.2370011335
## 818  0.0171994289
## 820  0.0481126874
## 822  0.1344335052
## 826  0.0347670672
## 836  0.0878261367
## 837  0.1923883452
## 842  0.1040802539
## 846  0.2073482600
## 848  0.0779881835
## 851  0.1930101432
## 854  0.1741842048
## 861  0.0959029745
## 863  0.1623398117
## 869  0.1268218404
# Metrics for model: per-row prediction error and squared error on the
# test set (mean of SquaredResidual gives the MSPE). Fixes the misspelled
# "Resisdual" column name.
MSPE <- data.frame(Observed = dfTest2$Attrition, Predicted = LR.Preds)
MSPE$Residual <- MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual <- MSPE$Residual^2
MSPE
##     Observed     Predicted     Resisdual SquaredResidual
## 4          0  0.0436343376 -0.0436343376    1.903955e-03
## 7          0  0.1757354301 -0.1757354301    3.088294e-02
## 8          0  0.1261548668 -0.1261548668    1.591505e-02
## 11         0 -0.0096785475  0.0096785475    9.367428e-05
## 20         0  0.1270006629 -0.1270006629    1.612917e-02
## 23         0  0.1247949152 -0.1247949152    1.557377e-02
## 25         0  0.1277521340 -0.1277521340    1.632061e-02
## 33         0  0.0506196216 -0.0506196216    2.562346e-03
## 40         0  0.2390046050 -0.2390046050    5.712320e-02
## 47         1  0.1345714009  0.8654285991    7.489667e-01
## 55         0  0.1965221637 -0.1965221637    3.862096e-02
## 56         0  0.1737091788 -0.1737091788    3.017488e-02
## 58         0  0.1978393508 -0.1978393508    3.914041e-02
## 62         0  0.1203358821 -0.1203358821    1.448072e-02
## 66         0  0.2193424035 -0.2193424035    4.811109e-02
## 81         1  0.1604895283  0.8395104717    7.047778e-01
## 83         0  0.1470060521 -0.1470060521    2.161078e-02
## 84         0  0.1390505143 -0.1390505143    1.933505e-02
## 88         1  0.1907521927  0.8092478073    6.548820e-01
## 92         1  0.1228033569  0.8771966431    7.694740e-01
## 93         1  0.2086153683  0.7913846317    6.262896e-01
## 99         0  0.2421487801 -0.2421487801    5.863603e-02
## 100        0  0.2064850283 -0.2064850283    4.263607e-02
## 101        0  0.2893754885 -0.2893754885    8.373817e-02
## 104        0  0.2952330668 -0.2952330668    8.716256e-02
## 105        0  0.1326403819 -0.1326403819    1.759347e-02
## 125        0  0.2026924251 -0.2026924251    4.108422e-02
## 129        0  0.1404653443 -0.1404653443    1.973051e-02
## 132        0  0.1072503453 -0.1072503453    1.150264e-02
## 133        0  0.1409489462 -0.1409489462    1.986661e-02
## 152        0  0.1675968220 -0.1675968220    2.808869e-02
## 157        0  0.1345922483 -0.1345922483    1.811507e-02
## 160        0  0.0442400680 -0.0442400680    1.957184e-03
## 161        0  0.1822631461 -0.1822631461    3.321985e-02
## 169        0  0.0952463419 -0.0952463419    9.071866e-03
## 176        0  0.3390744220 -0.3390744220    1.149715e-01
## 178        1  0.2993354296  0.7006645704    4.909308e-01
## 185        0  0.1670835191 -0.1670835191    2.791690e-02
## 186        0  0.2821553040 -0.2821553040    7.961162e-02
## 188        0  0.0664447806 -0.0664447806    4.414909e-03
## 198        0  0.0925916183 -0.0925916183    8.573208e-03
## 206        0  0.1479196979 -0.1479196979    2.188024e-02
## 216        0  0.1807685996 -0.1807685996    3.267729e-02
## 220        0  0.2720391955 -0.2720391955    7.400532e-02
## 235        0  0.0141801220 -0.0141801220    2.010759e-04
## 236        1  0.0560347834  0.9439652166    8.910703e-01
## 238        0  0.1447577332 -0.1447577332    2.095480e-02
## 240        0  0.0993334890 -0.0993334890    9.867142e-03
## 251        0  0.2429442292 -0.2429442292    5.902190e-02
## 254        1  0.2347121354  0.7652878646    5.856655e-01
## 255        0  0.0226159823 -0.0226159823    5.114827e-04
## 256        0  0.1565770557 -0.1565770557    2.451637e-02
## 257        0  0.2390566349 -0.2390566349    5.714807e-02
## 260        0  0.2414836826 -0.2414836826    5.831437e-02
## 264        0  0.2715256733 -0.2715256733    7.372619e-02
## 271        0  0.1220118722 -0.1220118722    1.488690e-02
## 275        0  0.1692231504 -0.1692231504    2.863647e-02
## 281        0  0.1035349089 -0.1035349089    1.071948e-02
## 283        0  0.1230065851 -0.1230065851    1.513062e-02
## 285        0  0.1880249177 -0.1880249177    3.535337e-02
## 286        0  0.1683642833 -0.1683642833    2.834653e-02
## 288        0  0.2203874771 -0.2203874771    4.857064e-02
## 298        1  0.1454284601  0.8545715399    7.302925e-01
## 335        0  0.2281072295 -0.2281072295    5.203291e-02
## 338        0  0.1084870401 -0.1084870401    1.176944e-02
## 346        0  0.0212883864 -0.0212883864    4.531954e-04
## 347        0  0.1902545233 -0.1902545233    3.619678e-02
## 366        0  0.1408643103 -0.1408643103    1.984275e-02
## 370        0  0.1760073288 -0.1760073288    3.097858e-02
## 380        1  0.2641960260  0.7358039740    5.414075e-01
## 392        0  0.2300298652 -0.2300298652    5.291374e-02
## 394        0  0.1297804955 -0.1297804955    1.684298e-02
## 403        0  0.1275064367 -0.1275064367    1.625789e-02
## 405        0  0.2151568837 -0.2151568837    4.629248e-02
## 420        0 -0.0092634384  0.0092634384    8.581129e-05
## 425        0  0.2023571148 -0.2023571148    4.094840e-02
## 427        0  0.2876757891 -0.2876757891    8.275736e-02
## 446        0  0.1217159934 -0.1217159934    1.481478e-02
## 453        1  0.1480852077  0.8519147923    7.257588e-01
## 456        0 -0.0615879524  0.0615879524    3.793076e-03
## 461        0  0.1680604569 -0.1680604569    2.824432e-02
## 463        0  0.1007597374 -0.1007597374    1.015252e-02
## 466        0  0.1698090307 -0.1698090307    2.883511e-02
## 470        0  0.0876810399 -0.0876810399    7.687965e-03
## 480        0 -0.0001150151  0.0001150151    1.322847e-08
## 482        0  0.2503211157 -0.2503211157    6.266066e-02
## 492        0  0.0980652835 -0.0980652835    9.616800e-03
## 493        0  0.1649303386 -0.1649303386    2.720202e-02
## 496        0  0.1093654833 -0.1093654833    1.196081e-02
## 497        0  0.1397739409 -0.1397739409    1.953675e-02
## 498        0  0.2043884617 -0.2043884617    4.177464e-02
## 499        0  0.1561042012 -0.1561042012    2.436852e-02
## 505        0  0.0611739359 -0.0611739359    3.742250e-03
## 508        0  0.1036572356 -0.1036572356    1.074482e-02
## 510        1 -0.0421241154  1.0421241154    1.086023e+00
## 511        0  0.1466417344 -0.1466417344    2.150380e-02
## 515        0  0.3177324288 -0.3177324288    1.009539e-01
## 518        0  0.1605517957 -0.1605517957    2.577688e-02
## 526        0  0.0990615121 -0.0990615121    9.813183e-03
## 530        1  0.0769086639  0.9230913361    8.520976e-01
## 531        0  0.0890132918 -0.0890132918    7.923366e-03
## 533        0  0.2559326596 -0.2559326596    6.550153e-02
## 540        0  0.1338977221 -0.1338977221    1.792860e-02
## 548        0  0.1426717240 -0.1426717240    2.035522e-02
## 580        0  0.1432781612 -0.1432781612    2.052863e-02
## 591        0  0.3025435343 -0.3025435343    9.153259e-02
## 592        1  0.1778235817  0.8221764183    6.759741e-01
## 594        0  0.1094560472 -0.1094560472    1.198063e-02
## 605        1  0.2542096746  0.7457903254    5.562032e-01
## 607        0  0.0702584011 -0.0702584011    4.936243e-03
## 608        0  0.1423030413 -0.1423030413    2.025016e-02
## 613        1  0.2266480798  0.7733519202    5.980732e-01
## 615        0  0.1237552980 -0.1237552980    1.531537e-02
## 621        1  0.3122184932  0.6877815068    4.730434e-01
## 623        0  0.2067925074 -0.2067925074    4.276314e-02
## 626        0  0.2762428213 -0.2762428213    7.631010e-02
## 627        0  0.2404771972 -0.2404771972    5.782928e-02
## 629        0  0.1196300431 -0.1196300431    1.431135e-02
## 630        0  0.0894892230 -0.0894892230    8.008321e-03
## 631        0  0.0664003641 -0.0664003641    4.409008e-03
## 632        0  0.0616623626 -0.0616623626    3.802247e-03
## 642        0  0.1516242819 -0.1516242819    2.298992e-02
## 644        1  0.2281623063  0.7718376937    5.957334e-01
## 647        1  0.3707016773  0.6292983227    3.960164e-01
## 649        0  0.1790239249 -0.1790239249    3.204957e-02
## 673        0  0.1971254325 -0.1971254325    3.885844e-02
## 685        0  0.2628221331 -0.2628221331    6.907547e-02
## 693        0  0.1157564397 -0.1157564397    1.339955e-02
## 694        1  0.2600090752  0.7399909248    5.475866e-01
## 696        0  0.1222708191 -0.1222708191    1.495015e-02
## 702        0  0.1533350660 -0.1533350660    2.351164e-02
## 708        0  0.2567468125 -0.2567468125    6.591893e-02
## 710        0  0.0976986242 -0.0976986242    9.545021e-03
## 713        0  0.1180365042 -0.1180365042    1.393262e-02
## 721        0  0.0770797638 -0.0770797638    5.941290e-03
## 722        0  0.1427401023 -0.1427401023    2.037474e-02
## 724        0  0.0863588681 -0.0863588681    7.457854e-03
## 732        1  0.1445283155  0.8554716845    7.318318e-01
## 743        0  0.1693015541 -0.1693015541    2.866302e-02
## 745        0  0.2097527218 -0.2097527218    4.399620e-02
## 746        0  0.3066642860 -0.3066642860    9.404298e-02
## 752        1  0.2517259239  0.7482740761    5.599141e-01
## 754        1  0.1056173402  0.8943826598    7.999203e-01
## 757        0  0.1851663217 -0.1851663217    3.428657e-02
## 763        0  0.1254043467 -0.1254043467    1.572625e-02
## 768        0  0.0617661000 -0.0617661000    3.815051e-03
## 772        0  0.0875552102 -0.0875552102    7.665915e-03
## 774        1  0.1917184296  0.8082815704    6.533191e-01
## 775        0  0.0784585878 -0.0784585878    6.155750e-03
## 777        0  0.2660568930 -0.2660568930    7.078627e-02
## 779        0  0.2820336150 -0.2820336150    7.954296e-02
## 781        1  0.1630651052  0.8369348948    7.004600e-01
## 782        0  0.1549465990 -0.1549465990    2.400845e-02
## 787        0  0.1061760211 -0.1061760211    1.127335e-02
## 792        0  0.2280744551 -0.2280744551    5.201796e-02
## 796        1  0.2117041144  0.7882958856    6.214104e-01
## 802        0  0.1403818648 -0.1403818648    1.970707e-02
## 803        0 -0.0353962014  0.0353962014    1.252891e-03
## 807        0  0.1645446326 -0.1645446326    2.707494e-02
## 813        0  0.2370011335 -0.2370011335    5.616954e-02
## 818        0  0.0171994289 -0.0171994289    2.958204e-04
## 820        1  0.0481126874  0.9518873126    9.060895e-01
## 822        0  0.1344335052 -0.1344335052    1.807237e-02
## 826        0  0.0347670672 -0.0347670672    1.208749e-03
## 836        0  0.0878261367 -0.0878261367    7.713430e-03
## 837        0  0.1923883452 -0.1923883452    3.701328e-02
## 842        0  0.1040802539 -0.1040802539    1.083270e-02
## 846        0  0.2073482600 -0.2073482600    4.299330e-02
## 848        0  0.0779881835 -0.0779881835    6.082157e-03
## 851        1  0.1930101432  0.8069898568    6.512326e-01
## 854        0  0.1741842048 -0.1741842048    3.034014e-02
## 861        0  0.0959029745 -0.0959029745    9.197381e-03
## 863        1  0.1623398117  0.8376601883    7.016746e-01
## 869        0  0.1268218404 -0.1268218404    1.608378e-02
# Mean squared prediction error and its square root (RMSE) for this model.
mean(MSPE$SquaredResidual)
## [1] 0.1362915
RMSE <- sqrt(mean((MSPE$Observed - MSPE$Predicted)^2))
RMSE
## [1] 0.3691768

Linear Regression for Salary

Change Job Level to Character

# Copy the data and recode JobLevel as character so lm() dummy-codes each
# level as its own category instead of fitting a single numeric slope.
# Fix: the original chain of gsub(1, "1", ...) ... gsub(5, "5", ...) calls
# replaced each digit with itself — five no-ops; as.character() alone is
# the whole conversion.
attrition.jl <- attrition.df
attrition.jl$JobLevel <- as.character(attrition.jl$JobLevel)

# Categorical job-level model: one income coefficient per level
# (JobLevel is character here, so level 1 is the baseline).
JLModel_fit <- lm(MonthlyIncome ~ JobLevel, data = attrition.jl)
summary(JLModel_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.jl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4642.2  -668.0  -107.3   668.3  4412.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2743.82      69.69   39.37   <2e-16 ***
## JobLevel2    2800.46      99.89   28.04   <2e-16 ***
## JobLevel3    7108.38     130.24   54.58   <2e-16 ***
## JobLevel4   12509.83     177.45   70.50   <2e-16 ***
## JobLevel5   16480.15     219.18   75.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared:  0.9248, Adjusted R-squared:  0.9244 
## F-statistic:  2658 on 4 and 865 DF,  p-value: < 2.2e-16
# Residual diagnostics and fitted-line plot for the categorical model.
preds <- predict(JLModel_fit)
hist(JLModel_fit$residuals, col = "blue", main = "Histogram of Residuals")

plot(JLModel_fit$fitted.values, JLModel_fit$residuals, main = "Plot of Residuals v. Fitted Values")

# Fix: colour is set outside aes() — mapping the constant "red" inside aes()
# creates a bogus legend entry instead of colouring the line. group = 1 is
# needed because JobLevel is character, so geom_line would otherwise treat
# each level as its own one-point group and draw nothing.
attrition.jl %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(aes(y = preds, group = 1), col = "red")

Predict Using Single Variable

# Simple linear regression: income on numeric JobLevel (single slope).
Model1_fit <- lm(MonthlyIncome ~ JobLevel, data = attrition.df)
summary(Model1_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5037.1  -928.2    80.1   697.1  3723.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1793.93     101.68  -17.64   <2e-16 ***
## JobLevel     4013.67      43.98   91.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1413 on 868 degrees of freedom
## Multiple R-squared:  0.9056, Adjusted R-squared:  0.9055 
## F-statistic:  8329 on 1 and 868 DF,  p-value: < 2.2e-16
hist(Model1_fit$residuals, col = "blue", main = "Histogram of Residuals")

# Predict income for the no-salary set; keep only the point estimate
# (first column, "fit") of the confidence-interval matrix.
SalaryPredict <- predict(Model1_fit, noSalary, interval = "confidence") 
SalaryPredict <- as.data.frame(SalaryPredict)
noSalary$MonthlyIncome <- SalaryPredict[, 1]
# Fix: colour set outside aes() — a constant inside aes() makes a legend,
# not a red line. The redundant data = noSalary in geom_line is inherited.
noSalary %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(col = "red")

Predict Using Exponential Variable (Squared)

# Quadratic model: add a squared JobLevel term to capture curvature.
# (Data-frame name attriton.df2 kept as-is; later chunks reference it.)
attriton.df2 <- attrition.df %>% mutate(JobLevel2 = JobLevel^2)
fit <- lm(MonthlyIncome ~ JobLevel + JobLevel2, data = attriton.df2)
summary(fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2, data = attriton.df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4502.5  -744.2  -131.4   656.1  4177.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   299.75     190.65   1.572    0.116    
## JobLevel     1944.17     169.12  11.496   <2e-16 ***
## JobLevel2     397.80      31.56  12.603   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1300 on 867 degrees of freedom
## Multiple R-squared:  0.9202, Adjusted R-squared:   0.92 
## F-statistic:  5001 on 2 and 867 DF,  p-value: < 2.2e-16
# Residual diagnostics and fitted-curve plot for the quadratic model.
preds2 <- predict(fit)
hist(fit$residuals, col = "blue", main = "Histogram of Residuals")

plot(fit$fitted.values, fit$residuals, main = "Plot of Residuals v. Fitted Values")

# Bug fix: plot this model's predictions (preds2) — the original plotted the
# stale preds vector from the earlier categorical model. Colour is also set
# outside aes() so the line is actually red rather than spawning a legend.
attriton.df2 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(aes(y = preds2), col = "red")

# Score the quadratic model on the no-salary set.
noSalary <- noSalary %>% mutate(JobLevel2 = JobLevel^2)
test2 <- predict(fit, noSalary, interval = "confidence") 
test2 <- as.data.frame(test2)
noSalary$Salary <- test2[, 1]
# Fix: colour set outside aes() (constant inside aes() only adds a legend).
noSalary %>% ggplot(aes(x = JobLevel, y = Salary)) + 
  geom_point() + 
  geom_line(col = "red")

Predict Using Exponential Variable (Cubed)

# Cubic model: quadratic terms plus a cubed JobLevel term (improved the fit).
attriton.df3 <- attrition.df %>%
  mutate(JobLevel2 = JobLevel^2,
         JobLevel3 = JobLevel^3)
fit2 <- lm(MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3, data = attriton.df3)
summary(fit2)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3, 
##     data = attriton.df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4796.8  -669.9  -108.7   652.2  4456.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3012.60     435.69   6.915 9.11e-12 ***
## JobLevel    -2176.07     620.83  -3.505  0.00048 ***
## JobLevel2    2125.20     252.82   8.406  < 2e-16 ***
## JobLevel3    -207.57      30.15  -6.884 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1267 on 866 degrees of freedom
## Multiple R-squared:  0.9244, Adjusted R-squared:  0.9241 
## F-statistic:  3528 on 3 and 866 DF,  p-value: < 2.2e-16
# Residual diagnostics and fitted-curve plot for the cubic model.
preds2 <- predict(fit2)
hist(fit2$residuals, col = "blue", main = "Histogram of Residuals")

plot(fit2$fitted.values, fit2$residuals, main = "Plot of Residuals v. Fitted Values")

# Fix: colour set outside aes() — mapping the constant inside aes() creates
# a spurious legend instead of colouring the line red.
attriton.df3 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(aes(y = preds2), col = "red")

# Score the cubic model on the no-salary set (JobLevel2 was added earlier).
noSalary <- noSalary %>% mutate(JobLevel3 = JobLevel^3)
test3 <- predict(fit2, noSalary, interval = "confidence") 
test3 <- as.data.frame(test3)
noSalary$Salary2 <- test3[, 1]
# Fix: colour set outside aes() (constant inside aes() only adds a legend).
noSalary %>% ggplot(aes(x = JobLevel, y = Salary2)) + 
  geom_point() + 
  geom_line(col = "red")

Predict Using Exponential Variable (4th Power)

# Quartic model: adding the 4th-power term did not help (see summary below —
# every coefficient loses significance to collinearity).
attriton.df4 <- attrition.df %>%
  mutate(JobLevel2 = JobLevel^2,
         JobLevel3 = JobLevel^3,
         JobLevel4 = JobLevel^4)
fit3 <- lm(MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 + JobLevel4, data = attriton.df4)
summary(fit3)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 + 
##     JobLevel4, data = attriton.df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4642.2  -668.0  -107.3   668.3  4412.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -245.90    1597.80  -0.154   0.8777  
## JobLevel     4177.61    3061.29   1.365   0.1727  
## JobLevel2   -1910.41    1920.81  -0.995   0.3202  
## JobLevel3     810.46     481.29   1.684   0.0926 .
## JobLevel4     -87.95      41.50  -2.119   0.0343 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared:  0.9248, Adjusted R-squared:  0.9244 
## F-statistic:  2658 on 4 and 865 DF,  p-value: < 2.2e-16
# Residual diagnostics and fitted-curve plot for the quartic model.
preds3 <- predict(fit3)
hist(fit3$residuals, col = "blue", main = "Histogram of Residuals")

plot(fit3$fitted.values, fit3$residuals, main = "Plot of Residuals v. Fitted Values")

# Fix: colour set outside aes() so the fitted line is drawn red
# (a constant inside aes() only produces a legend entry).
attriton.df4 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(aes(y = preds3), col = "red")

Multiple Variable Linear Regression

# Multiple regression: age, numeric job level, and job role.
# Best-performing income model found in this analysis.
Model2_fit <- lm(MonthlyIncome ~ Age + JobLevel + JobRole, data = dfTrain)
summary(Model2_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole, data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3597.1  -707.3   -13.4   634.5  4018.7 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -179.200    277.614  -0.646  0.51881    
## Age                             10.725      5.265   2.037  0.04200 *  
## JobLevel                      2988.030     76.238  39.193  < 2e-16 ***
## JobRoleHuman Resources        -502.938    285.061  -1.764  0.07810 .  
## JobRoleLaboratory Technician  -803.863    191.210  -4.204 2.95e-05 ***
## JobRoleManager                3823.428    254.520  15.022  < 2e-16 ***
## JobRoleManufacturing Director  -22.335    184.840  -0.121  0.90386    
## JobRoleResearch Director      3852.704    242.579  15.882  < 2e-16 ***
## JobRoleResearch Scientist     -474.371    191.176  -2.481  0.01331 *  
## JobRoleSales Executive        -261.845    164.079  -1.596  0.11096    
## JobRoleSales Representative   -684.793    238.703  -2.869  0.00424 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1099 on 729 degrees of freedom
## Multiple R-squared:  0.9446, Adjusted R-squared:  0.9439 
## F-statistic:  1244 on 10 and 729 DF,  p-value: < 2.2e-16
# In-sample predictions and residual diagnostics for the multiple-variable model.
preds <- predict(Model2_fit)

hist(Model2_fit$residuals, col = "blue", main = "Histogram of Residuals")

# Square roots of the residual sums of squares for the two models.
# NOTE(review): Model1_fit was fit on the full data (870 rows) while
# Model2_fit used dfTrain (740 rows), so these totals are computed over
# different observations — confirm before reading this as a fair comparison.
sqrt(sum((Model1_fit$residuals)^2))
## [1] 41638.29
sqrt(sum((Model2_fit$residuals)^2))
## [1] 29679.15
# Refit with JobSatisfaction added (overwrites Model2_fit).
# NOTE(review): labelled "better" originally, but per the summary below
# JobSatisfaction is not significant (p = 0.469) and adjusted R-squared
# does not improve (0.9438 vs 0.9439) — verify this claim.
Model2_fit = lm(MonthlyIncome~Age+JobLevel+JobRole+JobSatisfaction, data = dfTrain)
summary(Model2_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction, 
##     data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3575.5  -690.8   -26.4   647.2  4011.0 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -252.502    295.590  -0.854  0.39326    
## Age                             10.622      5.269   2.016  0.04416 *  
## JobLevel                      2988.926     76.273  39.187  < 2e-16 ***
## JobRoleHuman Resources        -498.394    285.223  -1.747  0.08099 .  
## JobRoleLaboratory Technician  -800.257    191.337  -4.182 3.24e-05 ***
## JobRoleManager                3830.739    254.803  15.034  < 2e-16 ***
## JobRoleManufacturing Director  -19.484    184.943  -0.105  0.91612    
## JobRoleResearch Director      3859.614    242.846  15.893  < 2e-16 ***
## JobRoleResearch Scientist     -473.397    191.243  -2.475  0.01354 *  
## JobRoleSales Executive        -259.069    164.177  -1.578  0.11500    
## JobRoleSales Representative   -678.904    238.919  -2.842  0.00461 ** 
## JobSatisfaction                 26.447     36.535   0.724  0.46937    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1100 on 728 degrees of freedom
## Multiple R-squared:  0.9447, Adjusted R-squared:  0.9438 
## F-statistic:  1130 on 11 and 728 DF,  p-value: < 2.2e-16
# Predict monthly income on the held-out test set and display as a data frame.
Model2_Preds <- predict(Model2_fit, newdata = dfTest)
as.data.frame(Model2_Preds)
##     Model2_Preds
## 4       8900.911
## 7       2777.885
## 8       5965.097
## 11     19058.970
## 20      6219.181
## 23      5461.196
## 33     16075.464
## 40      2492.601
## 55      6140.947
## 56      2398.131
## 58      2270.663
## 62      9176.022
## 66      2655.836
## 81      6171.272
## 83      6188.857
## 88      2360.844
## 93      2408.535
## 99      2297.110
## 100     2350.221
## 101     2249.418
## 104     2533.788
## 132     6092.565
## 133     6277.278
## 152     5911.985
## 157     5944.070
## 160    13046.477
## 161     5349.552
## 169     5830.763
## 176     2538.991
## 178     2565.655
## 185     2429.780
## 206     2756.858
## 216     2767.262
## 220     2644.996
## 235    16096.491
## 236     8927.576
## 240    13131.455
## 254     2593.770
## 255    13051.679
## 264     2471.357
## 271     6023.629
## 275    19188.866
## 283     6256.033
## 286     5354.972
## 288     2403.115
## 298     5365.594
## 346    11884.853
## 347     6146.367
## 380     2661.039
## 392     5848.251
## 394     9118.996
## 403     2514.759
## 405     2661.256
## 420    12961.280
## 425     5790.155
## 427     2661.039
## 446     2868.066
## 453     5943.852
## 461     2461.647
## 463     9092.114
## 470     6295.298
## 482     2698.108
## 492    16120.382
## 493     5853.671
## 498     2761.842
## 505    16141.408
## 508     9049.842
## 511     5975.719
## 515     2386.378
## 518     2435.200
## 526     9071.087
## 530     2910.773
## 531     9197.267
## 533     2650.416
## 540     5938.868
## 580     6139.187
## 592     5880.336
## 594     2483.110
## 605     2418.462
## 607     6172.342
## 608     5896.378
## 613     2313.152
## 615     8853.220
## 621     2254.402
## 623     2729.975
## 626     2629.172
## 627     5800.559
## 629     9331.916
## 630     9128.331
## 631     2953.045
## 632     8922.156
## 642     6176.692
## 644     2650.634
## 647     2206.711
## 649     5864.511
## 693     6277.278
## 694     2687.486
## 696     2662.559
## 702     5906.783
## 708     2620.217
## 710    16091.506
## 721     9167.161
## 732     5959.895
## 743     5750.768
## 745     5811.182
## 752     2439.707
## 757     6202.921
## 763     5954.692
## 768     8906.549
## 772     8970.283
## 774     2724.773
## 777     2636.042
## 779     2318.137
## 781     2545.931
## 782     6218.963
## 787    13166.097
## 796     2640.012
## 802     5901.363
## 803    15974.660
## 807     5811.182
## 813     5800.559
## 818    13003.769
## 826    19146.159
## 836     8901.129
## 842     8550.378
## 846     5248.966
## 851     5874.916
## 854     6049.223
## 863     5933.230
## 869     2820.374
# Assemble observed vs. predicted income with residuals for the test set.
# Fix: column name corrected from the misspelled "Resisdual"; only
# SquaredResidual is referenced downstream, so the rename is safe.
MSPE <- data.frame(Observed = dfTest$MonthlyIncome, Predicted = Model2_Preds)
MSPE$Residual <- MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual <- MSPE$Residual^2
MSPE
##     Observed Predicted   Resisdual SquaredResidual
## 4      10422  8900.911  1521.08860    2313710.5166
## 7       2127  2777.885  -650.88466     423650.8448
## 8       6694  5965.097   728.90293     531299.4768
## 11     19392 19058.970   333.03011     110909.0536
## 20      5033  6219.181 -1186.18116    1407025.7379
## 23      5679  5461.196   217.80438      47438.7490
## 33     16872 16075.464   796.53617     634469.8712
## 40      2791  2492.601   298.39878      89041.8349
## 55      4424  6140.947 -1716.94731    2947908.0496
## 56      2289  2398.131  -109.13076      11909.5233
## 58      4723  2270.663  2452.33749    6013959.1618
## 62      7094  9176.022 -2082.02247    4334817.5616
## 66      3298  2655.836   642.16355     412374.0219
## 81      4777  6171.272 -1394.27170    1943993.5702
## 83      5321  6188.857  -867.85676     753175.3626
## 88      3743  2360.844  1382.15634    1910356.1523
## 93      2090  2408.535  -318.53539     101464.7941
## 99      3180  2297.110   882.89047     779495.5776
## 100     3294  2350.221   943.77870     890718.2267
## 101     2099  2249.418  -150.41780      22625.5151
## 104     2819  2533.788   285.21176      81345.7472
## 132     4507  6092.565 -1585.56533    2514017.4002
## 133     4523  6277.278 -1754.27751    3077489.5972
## 152     5396  5911.985  -515.98530     266240.8313
## 157     5677  5944.070  -267.07009      71326.4342
## 160    13191 13046.477   144.52343      20887.0210
## 161     5126  5349.552  -223.55203      49975.5116
## 169     5577  5830.763  -253.76262      64395.4672
## 176     2070  2538.991  -468.99055     219952.1404
## 178     2042  2565.655  -523.65530     274214.8779
## 185     3755  2429.780  1325.21990    1756207.7894
## 206     2559  2756.858  -197.85768      39147.6625
## 216     2345  2767.262  -422.26231     178305.4575
## 220     2875  2644.996   230.00363      52901.6697
## 235    16959 16096.491   862.50919     743922.1026
## 236    10609  8927.576  1681.42385    2827186.1504
## 240    13269 13131.455   137.54459      18918.5149
## 254     2956  2593.770   362.22970     131210.3553
## 255    13757 13051.679   705.32111     497477.8737
## 264     2157  2471.357  -314.35651      98820.0131
## 271     5154  6023.629  -869.62889     756254.3992
## 275    18722 19188.866  -466.86641     217964.2414
## 283     6513  6256.033   256.96719      66032.1390
## 286     5207  5354.972  -147.97207      21895.7348
## 288     2661  2403.115   257.88465      66504.4938
## 298     6074  5365.594   708.40557     501838.4534
## 346    13341 11884.853  1456.14708    2120364.3146
## 347     4033  6146.367 -2113.36735    4466321.5414
## 380     2132  2661.039  -529.03877     279882.0155
## 392     4157  5848.251 -1691.25118    2860330.5385
## 394    10266  9118.996  1147.00374    1315617.5802
## 403     2105  2514.759  -409.75893     167902.3827
## 405     2559  2661.256  -102.25649      10456.3905
## 420    11935 12961.280 -1026.28001    1053250.6597
## 425     4724  5790.155 -1066.15482    1136686.0969
## 427     1274  2661.039 -1387.03877    1923876.5372
## 446     3420  2868.066   551.93419     304631.3488
## 453     5813  5943.852  -130.85236      17122.3413
## 461     2093  2461.647  -368.64716     135900.7291
## 463     9208  9092.114   115.88622      13429.6155
## 470     4450  6295.298 -1845.29804    3405124.8382
## 482     2356  2698.108  -342.10814     117037.9807
## 492    16880 16120.382   759.61849     577020.2501
## 493     5869  5853.671    15.32878        234.9716
## 498     2326  2761.842  -435.84227     189958.4824
## 505    17169 16141.408  1027.59151    1055944.3094
## 508    10596  9049.842  1546.15791    2390604.2746
## 511     5343  5975.719  -632.71943     400333.8739
## 515     2610  2386.378   223.62233      50006.9457
## 518     2766  2435.200   330.79986     109428.5481
## 526     7525  9071.087 -1546.08680    2390384.3971
## 530     4963  2910.773  2052.22704    4211635.8388
## 531     8823  9197.267  -374.26718     140075.9203
## 533     2700  2650.416    49.58359       2458.5323
## 540     5155  5938.868  -783.86778     614448.6951
## 580     6623  6139.187   483.81309     234075.1080
## 592     4599  5880.336 -1281.33597    1641821.8586
## 594     3211  2483.110   727.89040     529824.4384
## 605     2760  2418.462   341.53754     116647.8895
## 607     5473  6172.342  -699.34185     489079.0183
## 608     5605  5896.378  -291.37836      84901.3497
## 613     2707  2313.152   393.84807     155116.3039
## 615     7264  8853.220 -1589.21967    2525619.1708
## 621     2926  2254.402   671.59761     451043.3534
## 623     3280  2729.975   550.02480     302527.2753
## 626     2517  2629.172  -112.17170      12582.4909
## 627     4162  5800.559 -1638.55944    2684877.0545
## 629    10976  9331.916  1644.08413    2703012.6323
## 630     8621  9128.331  -507.33074     257384.4782
## 631     2662  2953.045  -291.04465      84706.9858
## 632     9888  8922.156   965.84389     932854.4145
## 642     4448  6176.692 -1728.69174    2988375.1318
## 644     2307  2650.634  -343.63414     118084.4216
## 647     1904  2206.711  -302.71066      91633.7417
## 649     4312  5864.511 -1552.51130    2410291.3328
## 693     6781  6277.278   503.72249     253736.3426
## 694     2285  2687.486  -402.48579     161994.8091
## 696     3294  2662.559   631.44112     398717.8826
## 702     5714  5906.783  -192.78299      37165.2806
## 708     2109  2620.217  -511.21732     261343.1506
## 710    17924 16091.506  1832.49378    3358033.4363
## 721     9434  9167.161   266.83949      71203.3123
## 732     5238  5959.895  -721.89476     521132.0444
## 743     4285  5750.768 -1465.76837    2148476.9158
## 745     4907  5811.182  -904.18180     817544.7261
## 752     2302  2439.707  -137.70717      18963.2650
## 757     4014  6202.921 -2188.92103    4791375.2930
## 763     6151  5954.692   196.30755      38536.6555
## 768    10932  8906.549  2025.45083    4102451.0523
## 772    10453  8970.283  1482.71670    2198448.8153
## 774     3388  2724.773   663.22711     439870.1974
## 777     2064  2636.042  -572.04199     327232.0381
## 779     2570  2318.137   251.86349      63435.2159
## 781     3140  2545.931   594.06929     352918.3159
## 782     4553  6218.963 -1665.96343    2775434.1481
## 787    13116 13166.097   -50.09652       2509.6613
## 796     3348  2640.012   707.98822     501247.3129
## 802     6582  5901.363   680.63705     463266.7977
## 803    17068 15974.660  1093.33967    1195391.6403
## 807     6232  5811.182   420.81820     177087.9581
## 813     4260  5800.559 -1540.55944    2373323.4033
## 818    11691 13003.769 -1312.76943    1723363.5703
## 826    19627 19146.159   480.84074     231207.8161
## 836     8834  8901.129   -67.12913       4506.3204
## 842     7988  8550.378  -562.37780     316268.7865
## 846     4558  5248.966  -690.96626     477434.3710
## 851     4559  5874.916 -1315.91593    1731634.7223
## 854     5661  6049.223  -388.22349     150717.4766
## 863     5304  5933.230  -629.23001     395930.4057
## 869     4477  2820.374  1656.62592    2744409.4370
# Mean squared prediction error and its square root (RMSE) on the test set.
mean(MSPE$SquaredResidual)
## [1] 970738.6
RMSE <- sqrt(mean((MSPE$Observed - MSPE$Predicted)^2))
RMSE
## [1] 985.2607